if (!require("pacman")) {
install.packages("pacman")
library("pacman")
}
pacman::p_load(
"tidyverse",
"readxl",
"janitor",
"scales",
"knitr",
"kableExtra",
"lubridate",
"fastDummies",
"mlogit",
"gmnl",
"ChoiceModelR"
)
This report documents our CBC analysis of consumer preferences for OTT streaming platforms. It covers the full pipeline: data exploration and cleaning, MNL and HB-MNL estimation, predictive validation, part-worth utilities, WTP, and market simulations.
| Design Element | Specification |
|---|---|
| Conjoint Method | Choice-Based Conjoint (CBC) |
| Design Type | Random (computer-generated via Discover) |
| Choice Tasks per Respondent | 10 |
| Alternatives per Task | 3 streaming profiles + 1 “None” option |
| Holdout Tasks | 1 (fixed, for validation) |
attr_df <- data.frame(
Attribute = c(
rep("Monthly Price", 5), rep("Ads", 3), rep("Video Quality", 3),
rep("Simultaneous Streams", 3), rep("Content Type", 3)
),
Level = c(
"€4.99", "€7.99", "€9.99", "€12.99", "€14.99",
"With Ads", "Limited Ads (before content)", "Ad-free",
"HD (720p)", "Full HD (1080p)", "4K + HDR",
"1 screen", "2 screens", "4 screens",
"Licensed Content Only", "Originals & Licensed", "Exclusive Originals Only"
),
Code = c(
"1", "2", "3", "4", "5",
"1", "2", "3",
"1", "2", "3",
"1", "2", "3",
"1", "2", "3"
)
)
attr_df %>%
kable(caption = "CBC Attributes, Levels, and Design Codes") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
collapse_rows(columns = 1, valign = "top")
| Attribute | Level | Code |
|---|---|---|
| Monthly Price | €4.99 | 1 |
| €7.99 | 2 | |
| €9.99 | 3 | |
| €12.99 | 4 | |
| €14.99 | 5 | |
| Ads | With Ads | 1 |
| Limited Ads (before content) | 2 | |
| Ad-free | 3 | |
| Video Quality | HD (720p) | 1 |
| Full HD (1080p) | 2 | |
| 4K + HDR | 3 | |
| Simultaneous Streams | 1 screen | 1 |
| 2 screens | 2 | |
| 4 screens | 3 | |
| Content Type | Licensed Content Only | 1 |
| Originals & Licensed | 2 | |
| Exclusive Originals Only | 3 |
Full factorial: \(5 \times 3 \times 3 \times 3 \times 3 = 405\) possible profiles.
# Respondent-level survey data
raw_all <- read_excel("CPM Survey - Respondent Data.xlsx", sheet = "All Data")
raw_utils <- read_excel("CPM Survey - Respondent Data.xlsx", sheet = "Rescaled - CBC - Random")
raw_imports <- read_excel("CPM Survey - Respondent Data.xlsx", sheet = "Importances - CBC - Random")
# Long-format CBC design with attribute codes and choices
design_raw <- read_excel("CPM Survey - CBC - Random Design & choices.xlsx",
sheet = "Design & Choices")
data.frame(
Source = c("All Data", "Rescaled Utilities", "Importances", "Design & Choices"),
Rows = c(nrow(raw_all), nrow(raw_utils), nrow(raw_imports), nrow(design_raw)),
Columns = c(ncol(raw_all), ncol(raw_utils), ncol(raw_imports), ncol(design_raw)),
Description = c(
"Survey responses (demographics, usage, CBC choices, segments)",
"Individual-level HB rescaled part-worth utilities",
"Individual-level attribute importance scores",
"Long-format experimental design with attribute codes and responses"
)
) %>%
kable(caption = "Data Sources Overview") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Source | Rows | Columns | Description |
|---|---|---|---|
| All Data | 107 | 40 | Survey responses (demographics, usage, CBC choices, segments) |
| Rescaled Utilities | 106 | 20 | Individual-level HB rescaled part-worth utilities |
| Importances | 106 | 6 | Individual-level attribute importance scores |
| Design & Choices | 4200 | 9 | Long-format experimental design with attribute codes and responses |
Row 1 of “All Data” contains survey question text (not a respondent). We remove it and clean variables.
survey <- raw_all[-1, ] %>%
rename(
record_id = `Record ID`,
status = `Respondent Status`,
start_time = `Start Time (UTC)`,
end_time = `End Time (UTC)`,
gender = Gender,
age = Age,
employment = `Employment Status`,
education = Education,
income = Income,
ott_use = `Use of OTT?`,
n_subscriptions = `Number of Subscriptions`,
platform_netflix = `Platforms currently in use_Netflix`,
platform_disney = `Platforms currently in use_Disney+`,
platform_amazon = `Platforms currently in use_Amazon Prime`,
platform_hbo = `Platforms currently in use_HBO Max`,
platform_apple = `Platforms currently in use_Apple TV+`,
platform_other = `Platforms currently in use_Other(s):`,
spending = Spending,
hours_ott = `Hours spent on OTT`,
holdout = `Holdout Question`
) %>%
mutate(
age = as.numeric(age),
n_subscriptions = as.numeric(n_subscriptions),
gender = as.factor(gender),
elapsed_minutes = as.numeric(difftime(end_time, start_time, units = "mins")),
across(starts_with("platform_"), ~ ifelse(is.na(.), 0L, 1L))
)
# Rename CBC task columns
for (i in 1:10) {
names(survey)[names(survey) == paste0("CBC - Random_", i)] <- paste0("cbc_task_", i)
}
survey <- survey %>%
mutate(across(starts_with("cbc_task_"), ~ as.integer(.x)))
cat("Respondents:", nrow(survey), "\n")
## Respondents: 106
cat("Unique IDs:", length(unique(survey$record_id)), "\n")
## Unique IDs: 106
cat("Duplicates:", sum(duplicated(survey$record_id)), "\n")
## Duplicates: 0
The design file contains the full CBC experiment in long format: one row per alternative per task per respondent, with integer-coded attribute levels and a binary response indicator.
cat("Design rows:", nrow(design_raw), "\n")
## Design rows: 4200
cat("Respondents in design:", length(unique(design_raw$`Record ID`)), "\n")
## Respondents in design: 105
cat("Tasks:", length(unique(design_raw$Task)), "\n")
## Tasks: 10
cat("Concepts per task:", length(unique(design_raw$Concept)), "\n")
## Concepts per task: 4
head(design_raw, 8)
# One respondent in survey has no design data
missing_from_design <- setdiff(survey$record_id, design_raw$`Record ID`)
cat("Respondents missing from design file:", length(missing_from_design), "\n")
## Respondents missing from design file: 1
if (length(missing_from_design) > 0) cat(" ID:", missing_from_design, "\n")
## ID: 698f033464169012b6813518
We restrict the analysis to the 105 respondents present in both datasets.
# Rename design columns
cbc <- design_raw %>%
rename(
record_id = `Record ID`,
task = Task,
alt = Concept,
price_code = `Monthly Price`,
ads_code = Ads,
quality_code = `Video Quality`,
streams_code = `Simultaneous Streams`,
content_code = `Content Type`,
choice = Response
)
# Map integer codes to actual attribute values
price_map <- c(`1` = 4.99, `2` = 7.99, `3` = 9.99, `4` = 12.99, `5` = 14.99)
ads_map <- c(`1` = "With Ads", `2` = "Limited Ads", `3` = "Ad-free")
quality_map <- c(`1` = "HD 720p", `2` = "Full HD 1080p", `3` = "4K HDR")
streams_map <- c(`1` = "1 screen", `2` = "2 screens", `3` = "4 screens")
content_map <- c(`1` = "Licensed Only", `2` = "Originals & Licensed", `3` = "Exclusive Originals")
cbc <- cbc %>%
mutate(
# Numeric price (0 for None)
Price = ifelse(price_code == 0, 0, price_map[as.character(price_code)]),
# NONE indicator
NONE = as.integer(alt == 4),
# Create numeric respondent id
id = as.integer(factor(record_id, levels = unique(record_id)))
)
cat("CBC data dimensions:", nrow(cbc), "rows x", ncol(cbc), "cols\n")
## CBC data dimensions: 4200 rows x 12 cols
cat("Respondents:", length(unique(cbc$id)), "\n")
## Respondents: 105
cat("Total choice sets:", nrow(cbc) / 4, "\n")
## Total choice sets: 1050
cat("Choices (Response=1):", sum(cbc$choice), "\n")
## Choices (Response=1): 1050
# Exactly one choice per choice set
choices_per_set <- cbc %>%
group_by(id, task) %>%
summarise(n_chosen = sum(choice), .groups = "drop")
cat("All choice sets have exactly 1 choice:",
all(choices_per_set$n_chosen == 1), "\n")
## All choice sets have exactly 1 choice: TRUE
# None option always in alt = 4 with all codes = 0
none_rows <- cbc %>% filter(NONE == 1)
cat("None rows always have zero attribute codes:",
all(none_rows$price_code == 0 &
none_rows$ads_code == 0 &
none_rows$quality_code == 0 &
none_rows$streams_code == 0 &
none_rows$content_code == 0), "\n")
## None rows always have zero attribute codes: TRUE
survey %>%
ggplot(aes(x = elapsed_minutes)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white", alpha = 0.8) +
geom_vline(xintercept = median(survey$elapsed_minutes, na.rm = TRUE),
linetype = "dashed", color = "darkred", linewidth = 0.8) +
annotate("text",
x = median(survey$elapsed_minutes, na.rm = TRUE) + 1,
y = Inf, vjust = 2, hjust = 0,
label = paste0("Median: ", round(median(survey$elapsed_minutes, na.rm = TRUE), 1), " min"),
color = "darkred", fontface = "bold") +
scale_x_continuous(breaks = seq(0, 30, 2), limits = c(0, 30)) +
labs(title = "Distribution of Survey Completion Times",
subtitle = "Excluding extreme outliers (>30 min)",
x = "Elapsed Time (minutes)", y = "Count") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
axis.text = element_text(color = "black"))
We identify low-quality respondents using the following criteria:
# Match survey to design respondents
survey_matched <- survey %>% filter(record_id %in% unique(cbc$record_id))
survey_matched <- survey_matched %>%
mutate(
none_count = rowSums(across(starts_with("cbc_task_"), ~ .x == 4)),
same_choice_count = apply(
select(., starts_with("cbc_task_")), 1,
function(x) max(table(x))
),
flag_speeder = elapsed_minutes < 3,
flag_straightliner = same_choice_count >= 8,
# Remove ONLY if both speeder AND straightliner
flag_remove = flag_speeder & flag_straightliner
)
data.frame(
Flag = c("Speeders (<3 min)", "Straight-liners (≥8 same)",
"Speeder AND Straight-liner (removed)",
"Retained respondents"),
Count = c(
sum(survey_matched$flag_speeder),
sum(survey_matched$flag_straightliner),
sum(survey_matched$flag_remove),
sum(!survey_matched$flag_remove)
)
) %>%
kable(caption = "Response Quality Assessment (n=105)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Flag | Count |
|---|---|
| Speeders (<3 min) | 11 |
| Straight-liners (≥8 same) | 10 |
| Speeder AND Straight-liner (removed) | 4 |
| Retained respondents | 101 |
removed <- survey_matched %>% filter(flag_remove)
if (nrow(removed) > 0) {
removed %>%
select(record_id, elapsed_minutes, same_choice_count, none_count, gender, age) %>%
mutate(elapsed_minutes = round(elapsed_minutes, 1)) %>%
kable(
caption = "Removed Respondents (Speeder + Straight-liner)",
col.names = c("Record ID", "Minutes", "Max Same Choice", "None Count", "Gender", "Age")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
}
| Record ID | Minutes | Max Same Choice | None Count | Gender | Age |
|---|---|---|---|---|---|
| 6984b3007126363e524407e0 | 2.1 | 10 | 0 | Male | 22 |
| 6984bc1a9b5bee61d8bb6048 | 2.4 | 10 | 0 | Female | 23 |
| 6984fa307126363e52445a50 | 3.0 | 10 | 0 | Male | 21 |
| 6987717c6851f6f632dd9502 | 2.8 | 10 | 0 | Female | 22 |
# Remove flagged respondents from both survey and CBC design data
ids_to_remove <- survey_matched %>% filter(flag_remove) %>% pull(record_id)
survey_matched <- survey_matched %>% filter(!flag_remove)
cbc <- cbc %>% filter(!record_id %in% ids_to_remove)
cat("Respondents after removal:", nrow(survey_matched), "\n")
## Respondents after removal: 101
cat("CBC rows after removal:", nrow(cbc), "\n")
## CBC rows after removal: 4040
cat("Respondents in CBC:", length(unique(cbc$id)), "\n")
## Respondents in CBC: 101
# Reassign numeric IDs after removal
cbc <- cbc %>%
mutate(id = as.integer(factor(record_id, levels = unique(record_id))))
We apply a conservative removal criterion (both conditions must hold) to preserve sample size. Respondents who completed quickly but varied their answers likely engaged with the task. Non-subscribers choosing “None” consistently reflect a legitimate preference, not inattention.
survey_matched %>%
count(gender) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
ggplot(aes(x = gender, y = n, fill = gender)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste0(n, " (", pct, "%)")),
vjust = -0.5, fontface = "bold", size = 4) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Gender Distribution", x = "", y = "Count") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
legend.position = "none",
axis.text = element_text(color = "black"))
survey_matched %>%
ggplot(aes(x = age)) +
geom_histogram(bins = 20, fill = "steelblue", color = "white", alpha = 0.8) +
geom_vline(xintercept = median(survey_matched$age, na.rm = TRUE),
linetype = "dashed", color = "darkred", linewidth = 0.8) +
annotate("text",
x = median(survey_matched$age, na.rm = TRUE) + 1.5,
y = Inf, vjust = 2, hjust = 0,
label = paste0("Median: ", median(survey_matched$age, na.rm = TRUE)),
color = "darkred", fontface = "bold") +
labs(title = "Age Distribution", x = "Age", y = "Count") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
axis.text = element_text(color = "black"))
p1 <- survey_matched %>%
count(employment) %>%
mutate(pct = round(n / sum(n) * 100, 1), employment = fct_reorder(employment, n)) %>%
ggplot(aes(x = n, y = employment)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
geom_text(aes(label = paste0(n, " (", pct, "%)")), hjust = -0.1, size = 3.5) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
labs(title = "Employment Status", x = "Count", y = "") +
theme_bw(base_size = 12) +
theme(plot.title = element_text(face = "bold"))
p2 <- survey_matched %>%
count(income) %>%
mutate(
pct = round(n / sum(n) * 100, 1),
income = factor(income, levels = c(
"Less than €1499", "€1500 - €2499", "€2500 - €3499",
"€3500 - €4999", "More than €5000"
))
) %>%
filter(!is.na(income)) %>%
ggplot(aes(x = income, y = n, fill = income)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste0(n, " (", pct, "%)")), vjust = -0.5, size = 3.5) +
scale_fill_brewer(palette = "Pastel1") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 12)) +
labs(title = "Monthly Net Income", x = "", y = "Count") +
theme_bw(base_size = 12) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")
p1
p2
The sample is predominantly young (median age 24), consisting mostly of students and early-career professionals with low-to-moderate incomes — typical of a university convenience sample.
survey_matched %>%
count(ott_use) %>%
mutate(pct = round(n / sum(n) * 100, 1), ott_use = fct_reorder(ott_use, n)) %>%
ggplot(aes(x = n, y = ott_use, fill = ott_use)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste0(n, " (", pct, "%)")),
hjust = -0.1, fontface = "bold", size = 4) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
scale_fill_brewer(palette = "Set2") +
labs(title = "OTT Subscription Status", x = "Count", y = "") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")
data.frame(
Platform = c("Netflix", "Disney+", "Amazon Prime", "HBO Max", "Apple TV+", "Other"),
Users = c(
sum(survey_matched$platform_netflix), sum(survey_matched$platform_disney),
sum(survey_matched$platform_amazon), sum(survey_matched$platform_hbo),
sum(survey_matched$platform_apple), sum(survey_matched$platform_other)
)
) %>%
mutate(Pct = round(Users / nrow(survey_matched) * 100, 1),
Platform = fct_reorder(Platform, Users)) %>%
ggplot(aes(x = Users, y = Platform, fill = Platform)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste0(Users, " (", Pct, "%)")),
hjust = -0.1, fontface = "bold", size = 4) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Platform Usage", x = "Number of Users", y = "") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")
Before model estimation, we test a few associations between respondent characteristics and their CBC choices.
Do higher-income respondents tend to choose more expensive streaming options? We compute the average price of each respondent’s chosen alternatives (excluding “None” choices) and correlate it with income.
# Compute average chosen price per respondent
avg_chosen_price <- cbc %>%
filter(choice == 1 & NONE == 0) %>%
group_by(record_id) %>%
summarise(avg_price = mean(Price), n_real_choices = n(), .groups = "drop")
# Map income to ordinal numeric
income_order <- c(
"Less than €1499" = 1,
"€1500 - €2499" = 2,
"€2500 - €3499" = 3,
"€3500 - €4999" = 4,
"More than €5000" = 5
)
corr_price_income <- survey_matched %>%
select(record_id, income) %>%
mutate(income_num = income_order[as.character(income)]) %>%
inner_join(avg_chosen_price, by = "record_id") %>%
filter(!is.na(income_num))
cor_val <- round(cor(corr_price_income$income_num,
corr_price_income$avg_price,
use = "complete.obs"), 3)
income_labels <- c(`1` = "<€1.5k", `2` = "€1.5-2.5k", `3` = "€2.5-3.5k",
`4` = "€3.5-5k", `5` = "€5k+")
corr_price_income %>%
mutate(income_label = income_labels[as.character(income_num)]) %>%
ggplot(aes(x = reorder(income_label, income_num), y = avg_price)) +
geom_boxplot(fill = "steelblue", alpha = 0.6) +
geom_jitter(width = 0.15, alpha = 0.4, size = 2) +
annotate("text", x = 1, y = max(corr_price_income$avg_price) + 0.3,
label = paste0("Spearman r = ", cor_val), hjust = 0,
fontface = "italic", size = 4, color = "darkred") +
labs(title = "Average Chosen Price vs Income Level",
subtitle = "Among non-None choices only",
x = "Income Level", y = "Average Chosen Price (€)") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
axis.text = element_text(color = "black"))
cat("Spearman correlation (income vs avg chosen price):", cor_val, "\n")
## Spearman correlation (income vs avg chosen price): 0.058
cor.test(corr_price_income$income_num, corr_price_income$avg_price,
method = "spearman")
##
## Spearman's rank correlation rho
##
## data: corr_price_income$income_num and corr_price_income$avg_price
## S = 150590, p-value = 0.3402
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.09636681
We examine whether Bachelor’s-level respondents differ from Master’s-level respondents in their subscription behavior (self-paying vs shared) and content preferences.
edu_sharing <- survey_matched %>%
filter(!is.na(ott_use) & ott_use != "No" & !is.na(education)) %>%
mutate(
edu_level = case_when(
grepl("High school", education) ~ "High School",
grepl("Bachelor", education) ~ "Bachelor's",
grepl("Master", education) ~ "Master's"
),
account_type = ifelse(
grepl("family|friend", ott_use, ignore.case = TRUE),
"Shared", "Self-Paying"
)
) %>%
filter(edu_level %in% c("Bachelor's", "Master's"))
# Cross-tabulation
edu_xtab <- table(edu_sharing$edu_level, edu_sharing$account_type)
cat("Education x Account Type:\n")
## Education x Account Type:
print(edu_xtab)
##
## Self-Paying Shared
## Bachelor's 26 25
## Master's 26 22
cat("\nProportions (row-wise):\n")
##
## Proportions (row-wise):
print(round(prop.table(edu_xtab, margin = 1) * 100, 1))
##
## Self-Paying Shared
## Bachelor's 51.0 49.0
## Master's 54.2 45.8
# Chi-squared test
chi_test <- chisq.test(edu_xtab, correct = FALSE)
cat("\nChi-squared test p-value:", round(chi_test$p.value, 4), "\n")
##
## Chi-squared test p-value: 0.751
# Visualization
edu_sharing %>%
count(edu_level, account_type) %>%
group_by(edu_level) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
ggplot(aes(x = edu_level, y = pct, fill = account_type)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
geom_text(aes(label = paste0(pct, "%")),
position = position_dodge(0.9), vjust = -0.5, fontface = "bold", size = 4) +
scale_fill_brewer(palette = "Set2", name = "Account Type") +
scale_y_continuous(limits = c(0, 80)) +
labs(title = "Account Type by Education Level",
subtitle = "Among OTT subscribers only",
x = "", y = "Percentage (%)") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
legend.position = "bottom",
axis.text = element_text(color = "black"))
# Number of subscriptions by education
edu_subs <- survey_matched %>%
filter(!is.na(n_subscriptions) & !is.na(education)) %>%
mutate(edu_level = case_when(
grepl("High school", education) ~ "High School",
grepl("Bachelor", education) ~ "Bachelor's",
grepl("Master", education) ~ "Master's"
)) %>%
filter(edu_level %in% c("Bachelor's", "Master's"))
edu_subs %>%
group_by(edu_level) %>%
summarise(
N = n(),
Mean_Subs = round(mean(n_subscriptions), 2),
SD = round(sd(n_subscriptions), 2),
.groups = "drop"
) %>%
kable(
caption = "Number of Subscriptions by Education Level",
col.names = c("Education", "N", "Mean Subscriptions", "SD")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Education | N | Mean Subscriptions | SD |
|---|---|---|---|
| Bachelor’s | 47 | 3.28 | 1.57 |
| Master’s | 44 | 2.70 | 1.29 |
none_chosen_pct <- round(
sum(cbc$choice == 1 & cbc$NONE == 1) / sum(cbc$choice == 1) * 100, 1
)
cbc %>%
filter(choice == 1) %>%
mutate(choice_label = ifelse(NONE == 1, "None",
paste0("Alt ", alt))) %>%
count(choice_label) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
ggplot(aes(x = choice_label, y = pct,
fill = choice_label)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste0(pct, "%")),
vjust = -0.5, fontface = "bold", size = 5) +
scale_fill_manual(values = c("#4DAF4A", "#377EB8", "#FF7F00", "#E41A1C")) +
scale_y_continuous(limits = c(0, 40)) +
labs(title = "Overall Choice Proportions",
subtitle = paste0("None chosen in ", none_chosen_pct, "% of all tasks"),
x = "", y = "Percentage (%)") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
legend.position = "none",
axis.text = element_text(color = "black"))
# Among non-None alternatives, how does choice relate to price?
cbc_real <- cbc %>% filter(NONE == 0)
xtab_price <- xtabs(~ Price + choice, data = cbc_real)
cat("Cross-tabulation: Price x Choice\n")
## Cross-tabulation: Price x Choice
print(xtab_price)
## choice
## Price 0 1
## 4.99 374 232
## 7.99 408 198
## 9.99 429 177
## 12.99 486 120
## 14.99 509 97
cat("\nRelative frequency of choices by price level:\n")
##
## Relative frequency of choices by price level:
rel_freq <- xtabs(choice ~ Price, data = cbc_real) /
sum(xtabs(choice ~ Price, data = cbc_real))
print(round(rel_freq, 3))
## Price
## 4.99 7.99 9.99 12.99 14.99
## 0.282 0.240 0.215 0.146 0.118
plot(xtabs(~ Price + choice, data = cbc_real),
main = "Mosaic Plot: Price vs Choice",
color = c("lightcoral", "steelblue"))
cbc_real <- cbc_real %>%
mutate(Ads = ads_map[as.character(ads_code)])
cat("Relative frequency of choices by ad level:\n")
## Relative frequency of choices by ad level:
rel_freq_ads <- xtabs(choice ~ Ads, data = cbc_real) /
sum(xtabs(choice ~ Ads, data = cbc_real))
print(round(rel_freq_ads, 3))
## Ads
## Ad-free Limited Ads With Ads
## 0.500 0.319 0.181
plot(xtabs(~ Ads + choice, data = cbc_real),
main = "Mosaic Plot: Ads vs Choice",
color = c("lightcoral", "steelblue"))
\[N \geq \frac{500 \cdot c}{t \cdot a}\]
N <- length(unique(cbc$id)) # 105
c_max <- 5 # Price has 5 levels
t_tasks <- 10 # 10 CBC tasks
a_alts <- 3 # 3 alternatives (excl. None)
required_N <- ceiling(500 * c_max / (t_tasks * a_alts))
actual_ratio <- N * t_tasks * a_alts / c_max
cat("N =", N, "| c =", c_max, "| t =", t_tasks, "| a =", a_alts, "\n")
## N = 101 | c = 5 | t = 10 | a = 3
cat("Minimum N required:", required_N, "\n")
## Minimum N required: 84
cat("N × t × a / c =", actual_ratio, "(≥ 500 required)\n")
## N × t × a / c = 606 (≥ 500 required)
cat("Result:", ifelse(actual_ratio >= 500, "SUFFICIENT", "INSUFFICIENT"), "\n")
## Result: SUFFICIENT
Two coding decisions shape the estimation: effects coding for categorical attributes and a linear mean-centered price.
Categorical attributes need numeric encoding. The two standard options are dummy coding (reference = 0) and effects coding (reference = −1). We use effects coding because:
Price enters the model as a single linear parameter rather than a set of level dummies:
# Effects coding: reference level coded as -1 (not 0)
effects_coding <- function(.data, vars, none = FALSE) {
variables <- names(dplyr::select(.data, {{ vars }}))
ws <- .data
if (isTRUE(none)) {
none_ws <- filter(ws, apply(ws[variables], 1, sum) == 0)
ws <- filter(ws, apply(ws[variables], 1, sum) != 0)
}
for (a in variables) {
current <- names(ws)
ws <- ws %>%
fastDummies::dummy_cols(select_columns = a, remove_first_dummy = TRUE)
helper <- setdiff(names(ws), current)
ws[["del"]] <- apply(ws[helper], 1, sum)
ws <- ws %>%
mutate(across(all_of(helper), ~ ifelse(del == 0, -1, .x)), del = NULL)
}
if (isTRUE(none)) {
mis_names <- setdiff(names(ws), names(none_ws))
none_ws[mis_names] <- 0
ws <- rbind(ws, none_ws)
}
return(ws)
}
# Mean-center price (excluding None)
mc_price <- function(.data, price) {
price_var <- dplyr::select(.data, {{ price }}) %>% pull()
mw <- mean(price_var[price_var != 0])
price_mc <- ifelse(price_var != 0, price_var - mw, 0)
.data %>% mutate(price_mc = price_mc)
}
# Numerically stable MNL probability
log_sum_exp <- function(x) {
cc <- max(x)
cc + log(sum(exp(x - cc)))
}
mnl_prob <- function(x) {
exp(x - log_sum_exp(x))
}
# The design data already has integer-coded attributes.
# We convert to the format needed for effects coding.
cbc_clean <- cbc %>%
select(id, record_id, task, alt, choice, NONE,
ads_code, quality_code, streams_code, content_code, Price) %>%
rename(ads = ads_code, quality = quality_code,
streams = streams_code, content = content_code) %>%
# For NONE rows, set attribute codes to 0 (already the case)
mutate(across(c(ads, quality, streams, content), ~ ifelse(NONE == 1, 0L, .x)))
# Apply effects coding to categorical attributes
cbc_eff <- cbc_clean %>%
effects_coding(vars = c(ads, quality, streams, content), none = TRUE) %>%
arrange(id, task, alt) %>%
mc_price(Price) %>%
mutate(obs = cumsum(c(1, diff(task) != 0 | diff(id) != 0)))
# Preview
cbc_eff %>%
filter(id == 1) %>%
head(8)
Each non-None alternative is represented by 8 effects-coded dummies (2 per categorical attribute), 1 mean-centered price, and 1 NONE indicator — 10 parameters total. For the None alternative all columns are 0 except NONE = 1.
We estimate a multinomial logit model with:
choice_data <- dfidx(
cbc_eff,
shape = "long",
choice = "choice",
idx = c("obs", "alt")
)
mnl1 <- mlogit(
choice ~ 0 +
NONE +
ads_2 + ads_3 +
quality_2 + quality_3 +
streams_2 + streams_3 +
content_2 + content_3 +
price_mc,
data = choice_data
)
summary(mnl1)
##
## Call:
## mlogit(formula = choice ~ 0 + NONE + ads_2 + ads_3 + quality_2 +
## quality_3 + streams_2 + streams_3 + content_2 + content_3 +
## price_mc, data = choice_data, method = "nr")
##
## Frequencies of alternatives:choice
## 1 2 3 4
## 0.32277 0.24851 0.24455 0.18416
##
## nr method
## 4 iterations, 0h:0m:0s
## g'(-H)^-1g = 5.77E-08
## gradient close to zero
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## NONE -0.192413 0.084512 -2.2768 0.02280 *
## ads_2 0.033089 0.058111 0.5694 0.56908
## ads_3 0.611839 0.055189 11.0862 < 2.2e-16 ***
## quality_2 0.102895 0.055957 1.8388 0.06594 .
## quality_3 0.235846 0.055748 4.2306 2.331e-05 ***
## streams_2 -0.017984 0.057939 -0.3104 0.75626
## streams_3 0.293894 0.055157 5.3283 9.911e-08 ***
## content_2 0.391206 0.054593 7.1658 7.732e-13 ***
## content_3 -0.260143 0.059710 -4.3568 1.320e-05 ***
## price_mc -0.125254 0.012594 -9.9452 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -1205.9
mnl0 <- mlogit(
choice ~ 0 + NONE + price_mc,
data = choice_data
)
lrtest(mnl0, mnl1)
The full model significantly outperforms the price-only model, so the non-price attributes add real explanatory power.
With effects coding, the estimated coefficients are already deviations from the attribute mean. The omitted reference levels have part-worth equal to the negative sum of the included levels.
coefs <- coef(mnl1)
# Build part-worth table
# Effects coding: reference = -sum(included levels)
pw_df <- data.frame(
var = c(
"NONE",
"With Ads", "Limited Ads", "Ad-free",
"HD 720p", "Full HD 1080p", "4K HDR",
"1 screen", "2 screens", "4 screens",
"Licensed Only", "Originals & Licensed", "Exclusive Originals",
"Price (per €1)"
),
attr = c(
"",
rep("Ads", 3),
rep("Video Quality", 3),
rep("Simultaneous Streams", 3),
rep("Content Type", 3),
"Price"
),
est = c(
coefs["NONE"],
# Ads: ref = With Ads = -(ads_2 + ads_3)
-(coefs["ads_2"] + coefs["ads_3"]),
coefs["ads_2"],
coefs["ads_3"],
# Quality: ref = HD 720p = -(quality_2 + quality_3)
-(coefs["quality_2"] + coefs["quality_3"]),
coefs["quality_2"],
coefs["quality_3"],
# Streams: ref = 1 screen = -(streams_2 + streams_3)
-(coefs["streams_2"] + coefs["streams_3"]),
coefs["streams_2"],
coefs["streams_3"],
# Content: ref = Licensed Only = -(content_2 + content_3)
-(coefs["content_2"] + coefs["content_3"]),
coefs["content_2"],
coefs["content_3"],
# Price
coefs["price_mc"]
),
order = 1:14
)
rownames(pw_df) <- NULL
pw_df %>%
mutate(est = round(est, 4)) %>%
kable(
caption = "MNL Part-Worth Utilities (Effects Coding)",
col.names = c("Level", "Attribute", "Part-Worth", "Order")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
collapse_rows(columns = 2, valign = "top")
| Level | Attribute | Part-Worth | Order |
|---|---|---|---|
| NONE | -0.1924 | 1 | |
| With Ads | Ads | -0.6449 | 2 |
| Limited Ads | 0.0331 | 3 | |
| Ad-free | 0.6118 | 4 | |
| HD 720p | Video Quality | -0.3387 | 5 |
| Full HD 1080p | 0.1029 | 6 | |
| 4K HDR | 0.2358 | 7 | |
| 1 screen | Simultaneous Streams | -0.2759 | 8 |
| 2 screens | -0.0180 | 9 | |
| 4 screens | 0.2939 | 10 | |
| Licensed Only | Content Type | -0.1311 | 11 |
| Originals & Licensed | 0.3912 | 12 | |
| Exclusive Originals | -0.2601 | 13 | |
| Price (per €1) | Price | -0.1253 | 14 |
pw_df %>%
filter(attr != "" & attr != "Price") %>%
ggplot(aes(x = reorder(var, order), y = est, group = attr)) +
geom_point(size = 3, color = "steelblue") +
geom_line(color = "steelblue", linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", color = "darkblue") +
facet_wrap(~attr, scales = "free_x") +
labs(title = "Part-Worth Utilities (MNL)",
x = "Attribute Levels", y = "Part-Worth Utility") +
theme_bw(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 35, hjust = 1, size = 9),
axis.text = element_text(color = "black"),
strip.text = element_text(face = "bold")
)
The importance of each attribute is computed as the range of its part-worth utilities divided by the sum of all ranges.
\[\text{RAI}_k = \frac{\max(\text{PW}_k) - \min(\text{PW}_k)}{\sum_{k=1}^{K} [\max(\text{PW}_k) - \min(\text{PW}_k)]} \times 100\]
For price (linear specification), the range is \(|\beta_{\text{price}}| \times (\text{Price}_{max} - \text{Price}_{min})\).
price_range <- (14.99 - 4.99) * abs(coefs["price_mc"])
RAI <- pw_df %>%
filter(attr != "" & attr != "Price") %>%
group_by(attr) %>%
summarise(range = max(est) - min(est), .groups = "drop") %>%
rbind(data.frame(attr = "Price", range = as.numeric(price_range))) %>%
mutate(RAI = round(range / sum(range) * 100, 1)) %>%
arrange(desc(RAI))
RAI %>%
kable(
caption = "Relative Attribute Importance (MNL)",
col.names = c("Attribute", "Range", "RAI (%)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Attribute | Range | RAI (%) |
|---|---|---|
| Ads | 1.2567664 | 29.2 |
| Price | 1.2525447 | 29.1 |
| Content Type | 0.6513495 | 15.1 |
| Video Quality | 0.5745861 | 13.3 |
| Simultaneous Streams | 0.5698041 | 13.2 |
ggplot(RAI, aes(x = reorder(attr, RAI), y = RAI,
label = paste0(RAI, "%"))) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
geom_text(hjust = -0.2, fontface = "bold", size = 4.5) +
coord_flip() +
scale_y_continuous(limits = c(0, max(RAI$RAI) * 1.2),
breaks = seq(0, 50, 10)) +
labs(title = "Relative Attribute Importance (MNL)",
x = "", y = "RAI (%)") +
theme_bw(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
axis.text = element_text(color = "black")
)
\[\text{WTP}_k = \frac{-\beta_k}{\beta_{\text{price}}}\]
# WTP for all non-price attribute levels
wtp_df <- pw_df %>%
filter(attr != "" & attr != "Price") %>%
mutate(
WTP = round(-est / coefs["price_mc"], 2)
)
wtp_df %>%
select(attr, var, est, WTP) %>%
kable(
caption = "Willingness-to-Pay (€) for Non-Price Attribute Levels",
col.names = c("Attribute", "Level", "Part-Worth", "WTP (€)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
collapse_rows(columns = 1, valign = "top")
| Attribute | Level | Part-Worth | WTP (€) |
|---|---|---|---|
| Ads | With Ads | -0.6449275 | -5.15 |
| Limited Ads | 0.0330885 | 0.26 | |
| Ad-free | 0.6118389 | 4.88 | |
| Video Quality | HD 720p | -0.3387403 | -2.70 |
| Full HD 1080p | 0.1028945 | 0.82 | |
| 4K HDR | 0.2358458 | 1.88 | |
| Simultaneous Streams | 1 screen | -0.2759100 | -2.20 |
| 2 screens | -0.0179841 | -0.14 | |
| 4 screens | 0.2938941 | 2.35 | |
| Content Type | Licensed Only | -0.1310625 | -1.05 |
| Originals & Licensed | 0.3912060 | 3.12 | |
| Exclusive Originals | -0.2601435 | -2.08 |
# Cross-check with gmnl package
wtp.gmnl(mnl1, wrt = "price_mc")
##
## Willigness-to-pay respect to: price_mc
##
## Estimate Std. Error t-value Pr(>|t|)
## NONE 1.53618 0.70991 2.1639 0.03047 *
## ads_2 -0.26417 0.46440 -0.5688 0.56946
## ads_3 -4.88477 0.62924 -7.7629 8.216e-15 ***
## quality_2 -0.82148 0.45248 -1.8155 0.06945 .
## quality_3 -1.88293 0.47579 -3.9575 7.573e-05 ***
## streams_2 0.14358 0.46260 0.3104 0.75627
## streams_3 -2.34638 0.49031 -4.7855 1.705e-06 ***
## content_2 -3.12329 0.51148 -6.1064 1.019e-09 ***
## content_3 2.07692 0.51515 4.0317 5.538e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
wtp_df %>%
ggplot(aes(x = reorder(var, WTP), y = WTP, fill = attr)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_text(aes(label = paste0("€", WTP)),
hjust = ifelse(wtp_df$WTP >= 0, -0.1, 1.1),
fontface = "bold", size = 3.5) +
coord_flip() +
scale_fill_brewer(palette = "Set2", name = "Attribute") +
labs(title = "Willingness-to-Pay by Attribute Level",
x = "", y = "WTP (€)") +
theme_bw(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
axis.text = element_text(color = "black"),
legend.position = "bottom"
)
The HB-MNL model estimates individual-level part-worths, allowing us to account for preference heterogeneity across respondents.
# Create the choices vector (ChoiceModelR format)
indexes <- cbc_eff %>%
group_by(id, task) %>%
reframe(
ch = alt[choice == 1],
no = max(alt)
)
choices_vec <- unname(unlist(purrr::map(seq.int(nrow(indexes)), function(x) {
c(indexes$ch[x], rep(0, times = indexes$no[x] - 1))
})))
# Build design matrix
dm <- cbc_eff %>%
mutate(choices = choices_vec) %>%
select(id, task, alt,
ads_2, ads_3,
quality_2, quality_3,
streams_2, streams_3,
content_2, content_3,
price_mc, NONE, choices) %>%
as.data.frame() %>%
mutate(task = cumsum(c(1, diff(alt) < 0)), .by = id)
# Number of parameters (everything except id, task, alt, choices)
n_params <- ncol(dm) - 4
cat("Parameters to estimate:", n_params, "\n")
## Parameters to estimate: 10
cat("Design matrix dimensions:", nrow(dm), "x", ncol(dm), "\n")
## Design matrix dimensions: 4040 x 14
dir.create("hbmnl", showWarnings = FALSE)
set.seed(42)
hb_out <- choicemodelr(
data = dm,
xcoding = rep(1, times = n_params),
mcmc = list(R = 20000, use = 10000),
options = list(none = FALSE, save = TRUE, keep = 10),
directory = "hbmnl"
)
# Beta point estimates: mean across MCMC draws for each individual
betas_ind <- apply(hb_out$betadraw, c(1, 2), mean)
# Column names match the order in dm
param_names <- c("ads_2", "ads_3", "quality_2", "quality_3",
"streams_2", "streams_3", "content_2", "content_3",
"price_mc", "NONE")
colnames(betas_ind) <- param_names
cat("Individual betas matrix:", nrow(betas_ind), "respondents x",
ncol(betas_ind), "parameters\n")
## Individual betas matrix: 101 respondents x 10 parameters
# Population-level means and SDs
hb_means <- colMeans(betas_ind)
hb_sds <- apply(betas_ind, 2, sd)
# Reconstruct reference levels
hb_pw <- data.frame(
var = c(
"With Ads", "Limited Ads", "Ad-free",
"HD 720p", "Full HD 1080p", "4K HDR",
"1 screen", "2 screens", "4 screens",
"Licensed Only", "Originals & Licensed", "Exclusive Originals",
"Price (per €1)", "NONE"
),
attr = c(
rep("Ads", 3), rep("Video Quality", 3),
rep("Simultaneous Streams", 3), rep("Content Type", 3),
"Price", ""
),
Mean = c(
-(hb_means["ads_2"] + hb_means["ads_3"]),
hb_means["ads_2"], hb_means["ads_3"],
-(hb_means["quality_2"] + hb_means["quality_3"]),
hb_means["quality_2"], hb_means["quality_3"],
-(hb_means["streams_2"] + hb_means["streams_3"]),
hb_means["streams_2"], hb_means["streams_3"],
-(hb_means["content_2"] + hb_means["content_3"]),
hb_means["content_2"], hb_means["content_3"],
hb_means["price_mc"],
hb_means["NONE"]
),
SD = c(
# For reference levels, compute SD of -(ind_ads_2 + ind_ads_3)
sd(-(betas_ind[, "ads_2"] + betas_ind[, "ads_3"])),
hb_sds["ads_2"], hb_sds["ads_3"],
sd(-(betas_ind[, "quality_2"] + betas_ind[, "quality_3"])),
hb_sds["quality_2"], hb_sds["quality_3"],
sd(-(betas_ind[, "streams_2"] + betas_ind[, "streams_3"])),
hb_sds["streams_2"], hb_sds["streams_3"],
sd(-(betas_ind[, "content_2"] + betas_ind[, "content_3"])),
hb_sds["content_2"], hb_sds["content_3"],
hb_sds["price_mc"],
hb_sds["NONE"]
)
)
hb_pw %>%
mutate(Mean = round(Mean, 4), SD = round(SD, 4)) %>%
kable(
caption = "HB-MNL Part-Worth Utilities: Population Means and SDs",
col.names = c("Level", "Attribute", "Mean", "SD")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
collapse_rows(columns = 2, valign = "top")
| Level | Attribute | Mean | SD |
|---|---|---|---|
| With Ads | Ads | -2.5548 | 1.8171 |
| Limited Ads | 0.2656 | 0.7766 | |
| Ad-free | 2.2891 | 2.0522 | |
| HD 720p | Video Quality | -1.1136 | 1.3191 |
| Full HD 1080p | 0.2862 | 0.7873 | |
| 4K HDR | 0.8274 | 0.9130 | |
| 1 screen | Simultaneous Streams | -1.0788 | 1.1061 |
| 2 screens | -0.0115 | 0.7487 | |
| 4 screens | 1.0903 | 1.1951 | |
| Licensed Only | Content Type | -0.3226 | 0.9406 |
| Originals & Licensed | 1.1446 | 1.2149 | |
| Exclusive Originals | -0.8220 | 1.0484 | |
| Price (per €1) | Price | -0.4478 | 0.5664 |
| NONE | -0.0763 | 4.2024 |
# Compute RAI at the individual level, then average
compute_individual_rai <- function(beta_row) {
# Reconstruct part-worths for each attribute
ads_pw <- c(-(beta_row["ads_2"] + beta_row["ads_3"]),
beta_row["ads_2"], beta_row["ads_3"])
quality_pw <- c(-(beta_row["quality_2"] + beta_row["quality_3"]),
beta_row["quality_2"], beta_row["quality_3"])
streams_pw <- c(-(beta_row["streams_2"] + beta_row["streams_3"]),
beta_row["streams_2"], beta_row["streams_3"])
content_pw <- c(-(beta_row["content_2"] + beta_row["content_3"]),
beta_row["content_2"], beta_row["content_3"])
price_range_i <- abs(beta_row["price_mc"]) * (14.99 - 4.99)
ranges <- c(
Price = as.numeric(price_range_i),
Ads = diff(range(ads_pw)),
`Video Quality` = diff(range(quality_pw)),
`Simultaneous Streams` = diff(range(streams_pw)),
`Content Type` = diff(range(content_pw))
)
ranges / sum(ranges) * 100
}
rai_individual <- t(apply(betas_ind, 1, compute_individual_rai))
rai_hb <- data.frame(
Attribute = colnames(rai_individual),
Mean = round(colMeans(rai_individual), 1),
SD = round(apply(rai_individual, 2, sd), 1)
) %>%
arrange(desc(Mean))
rai_hb %>%
kable(
caption = "HB-MNL Relative Attribute Importance: Means and SDs",
col.names = c("Attribute", "Mean RAI (%)", "SD (%)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Attribute | Mean RAI (%) | SD (%) | |
|---|---|---|---|
| Price | Price | 26.9 | 17.5 |
| Ads | Ads | 26.7 | 14.6 |
| Content Type | Content Type | 16.1 | 10.0 |
| Simultaneous Streams | Simultaneous Streams | 15.5 | 8.7 |
| Video Quality | Video Quality | 14.7 | 10.1 |
ggplot(rai_hb, aes(x = reorder(Attribute, Mean), y = Mean,
label = paste0(Mean, "%"))) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD),
width = 0.3, color = "grey30") +
geom_text(hjust = -0.3, fontface = "bold", size = 4.5) +
coord_flip() +
scale_y_continuous(limits = c(0, max(rai_hb$Mean + rai_hb$SD) * 1.15),
breaks = seq(0, 60, 10)) +
labs(title = "Relative Attribute Importance (HB-MNL)",
subtitle = "Error bars show ±1 SD across respondents",
x = "", y = "RAI (%)") +
theme_bw(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
axis.text = element_text(color = "black")
)
as.data.frame(rai_individual) %>%
pivot_longer(everything(), names_to = "Attribute", values_to = "RAI") %>%
ggplot(aes(x = reorder(Attribute, RAI, FUN = median), y = RAI, fill = Attribute)) +
geom_boxplot(alpha = 0.7) +
coord_flip() +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribution of Individual Attribute Importance (HB-MNL)",
x = "", y = "RAI (%)") +
theme_bw(base_size = 14) +
theme(plot.title = element_text(face = "bold"),
legend.position = "none",
axis.text = element_text(color = "black"))
# Individual-level WTP: -beta_k / beta_price for each respondent
wtp_individual <- data.frame(
# Ads
`With Ads` = -(-(betas_ind[,"ads_2"] + betas_ind[,"ads_3"])) / betas_ind[,"price_mc"],
`Limited Ads` = -betas_ind[,"ads_2"] / betas_ind[,"price_mc"],
`Ad-free` = -betas_ind[,"ads_3"] / betas_ind[,"price_mc"],
# Quality
`HD 720p` = -(-(betas_ind[,"quality_2"] + betas_ind[,"quality_3"])) / betas_ind[,"price_mc"],
`Full HD 1080p` = -betas_ind[,"quality_2"] / betas_ind[,"price_mc"],
`4K HDR` = -betas_ind[,"quality_3"] / betas_ind[,"price_mc"],
# Streams
`1 screen` = -(-(betas_ind[,"streams_2"] + betas_ind[,"streams_3"])) / betas_ind[,"price_mc"],
`2 screens` = -betas_ind[,"streams_2"] / betas_ind[,"price_mc"],
`4 screens` = -betas_ind[,"streams_3"] / betas_ind[,"price_mc"],
# Content
`Licensed Only` = -(-(betas_ind[,"content_2"] + betas_ind[,"content_3"])) / betas_ind[,"price_mc"],
`Originals & Licensed` = -betas_ind[,"content_2"] / betas_ind[,"price_mc"],
`Exclusive Originals` = -betas_ind[,"content_3"] / betas_ind[,"price_mc"],
check.names = FALSE
)
wtp_summary <- data.frame(
Level = names(wtp_individual),
Attribute = c(rep("Ads", 3), rep("Video Quality", 3),
rep("Simultaneous Streams", 3), rep("Content Type", 3)),
Mean_WTP = round(colMeans(wtp_individual), 2),
SD_WTP = round(apply(wtp_individual, 2, sd), 2)
)
rownames(wtp_summary) <- NULL
wtp_summary %>%
kable(
caption = "HB-MNL Willingness-to-Pay (€): Means and SDs",
col.names = c("Level", "Attribute", "Mean WTP (€)", "SD (€)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
collapse_rows(columns = 2, valign = "top")
| Level | Attribute | Mean WTP (€) | SD (€) |
|---|---|---|---|
| With Ads | Ads | -172.00 | 1651.12 |
| Limited Ads | 82.49 | 804.30 | |
| Ad-free | 89.52 | 846.96 | |
| HD 720p | Video Quality | -70.88 | 651.78 |
| Full HD 1080p | -13.38 | 151.82 | |
| 4K HDR | 84.25 | 803.09 | |
| 1 screen | Simultaneous Streams | 39.61 | 448.80 |
| 2 screens | 10.73 | 112.41 | |
| 4 screens | -50.34 | 561.10 | |
| Licensed Only | Content Type | -130.23 | 1272.39 |
| Originals & Licensed | 238.23 | 2361.81 | |
| Exclusive Originals | -108.00 | 1089.96 |
We assess predictive validity using a holdout task — a fixed choice set shown to all respondents but excluded from estimation.
holdout_profiles <- data.frame(
Alternative = c("A", "B", "C", "None"),
Price = c(7.99, 12.99, 4.99, 0),
Ads = c("Limited Ads", "Ad-free", "With Ads", "-"),
Quality = c("Full HD 1080p", "4K HDR", "HD 720p", "-"),
Streams = c("2 screens", "4 screens", "1 screen", "-"),
Content = c("Originals & Licensed", "Exclusive Originals", "Licensed Only", "-")
)
holdout_profiles %>%
kable(caption = "Holdout Task: Product Profiles") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Alternative | Price | Ads | Quality | Streams | Content |
|---|---|---|---|---|---|
| A | 7.99 | Limited Ads | Full HD 1080p | 2 screens | Originals & Licensed |
| B | 12.99 | Ad-free | 4K HDR | 4 screens | Exclusive Originals |
| C | 4.99 | With Ads | HD 720p | 1 screen | Licensed Only |
| None | 0.00 |
|
|
|
|
# Mean-centered prices for holdout
mw_price <- mean(cbc$Price[cbc$Price != 0])
# Holdout design matrix (effects coding)
holdout_dm <- data.frame(
ads_2 = c( 1, -1, -1, 0), # Limited Ads / Ad-free / With Ads / None
ads_3 = c(-1, 1, -1, 0),
quality_2 = c( 1, -1, -1, 0), # Full HD / 4K / HD 720p / None
quality_3 = c(-1, 1, -1, 0),
streams_2 = c( 1, -1, -1, 0), # 2 screens / 4 screens / 1 screen / None
streams_3 = c(-1, 1, -1, 0),
content_2 = c( 1, -1, -1, 0), # Orig&Lic / ExclOrig / Licensed / None
content_3 = c(-1, 1, -1, 0),
price_mc = c(7.99 - mw_price, 12.99 - mw_price, 4.99 - mw_price, 0),
NONE = c(0, 0, 0, 1)
)
# Individual-level utilities for holdout alternatives
holdout_utils <- as.data.frame(betas_ind %*% t(as.matrix(holdout_dm))) %>%
setNames(c("A", "B", "C", "None"))
# Actual choices from survey
holdout_actual <- survey_matched %>%
mutate(
holdout_choice = case_when(
grepl("€7.99", holdout) ~ 1L,
grepl("€12.99", holdout) ~ 2L,
grepl("€4.99", holdout) ~ 3L,
TRUE ~ 4L
)
) %>%
pull(holdout_choice)
holdout_utils[["actual"]] <- holdout_actual
predicted_choice <- apply(holdout_utils[, 1:4], 1, which.max)
hit_rate <- round(sum(predicted_choice == holdout_utils$actual) /
nrow(holdout_utils) * 100, 2)
cat("Hit Rate:", hit_rate, "%\n")
## Hit Rate: 59.41 %
cat("(Chance level: 25%)\n")
## (Chance level: 25%)
# Choice probabilities per respondent
ch_probs <- as.data.frame(t(apply(holdout_utils[, 1:4], 1, mnl_prob)))
names(ch_probs) <- c("A", "B", "C", "None")
# Probability assigned to the actually chosen alternative
ch_probs[["actual_label"]] <- c("A", "B", "C", "None")[holdout_utils$actual]
mean_hit_prob <- ch_probs %>%
rowwise() %>%
mutate(hit_prob = get(actual_label)) %>%
ungroup() %>%
summarise(mhp = mean(hit_prob) * 100) %>%
pull(mhp) %>%
round(2)
cat("Mean Hit Probability:", mean_hit_prob, "%\n")
## Mean Hit Probability: 56.88 %
cat("(Chance level: 25%)\n")
## (Chance level: 25%)
# Actual choice shares
actual_shares <- sapply(1:4, function(x) {
sum(holdout_utils$actual == x)
}) / nrow(holdout_utils)
# Predicted choice shares (mean probabilities)
predicted_shares <- colMeans(ch_probs[, 1:4])
mae <- round(mean(abs(actual_shares - predicted_shares)) * 100, 2)
# Summary table
data.frame(
Alternative = c("A", "B", "C", "None"),
Actual = round(actual_shares * 100, 1),
Predicted = round(predicted_shares * 100, 1),
AbsError = round(abs(actual_shares - predicted_shares) * 100, 1)
) %>%
kable(
caption = "Holdout Validation: Actual vs Predicted Choice Shares (%)",
col.names = c("Alternative", "Actual (%)", "Predicted (%)", "|Error| (%)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Alternative | Actual (%) | Predicted (%) | |Error| (%) | |
|---|---|---|---|---|
| A | A | 44.6 | 33.0 | 11.6 |
| B | B | 34.7 | 38.2 | 3.6 |
| C | C | 7.9 | 7.1 | 0.8 |
| None | None | 12.9 | 21.7 | 8.8 |
cat("Mean Absolute Error:", mae, "%\n")
## Mean Absolute Error: 6.19 %
data.frame(
Metric = c("Hit Rate", "Mean Hit Probability", "Mean Absolute Error"),
Value = c(
paste0(hit_rate, "%"),
paste0(mean_hit_prob, "%"),
paste0(mae, "%")
),
Benchmark = c("25% (chance)", "25% (chance)", "0% (perfect)")
) %>%
kable(caption = "Predictive Validation Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Metric | Value | Benchmark |
|---|---|---|
| Hit Rate | 59.41% | 25% (chance) |
| Mean Hit Probability | 56.88% | 25% (chance) |
| Mean Absolute Error | 6.19% | 0% (perfect) |
# Map individual RAI to respondents
rai_with_demo <- as.data.frame(rai_individual)
rai_with_demo$record_id <- unique(cbc$record_id)
rai_gender <- rai_with_demo %>%
left_join(survey_matched %>% select(record_id, gender), by = "record_id") %>%
filter(!is.na(gender)) %>%
pivot_longer(cols = -c(record_id, gender),
names_to = "Attribute", values_to = "RAI") %>%
group_by(gender, Attribute) %>%
summarise(Mean = round(mean(RAI), 1), SD = round(sd(RAI), 1), .groups = "drop")
rai_gender %>%
pivot_wider(names_from = gender, values_from = c(Mean, SD)) %>%
kable(caption = "Attribute Importance by Gender (HB-MNL)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Attribute | Mean_Female | Mean_Male | SD_Female | SD_Male |
|---|---|---|---|---|
| Ads | 27.7 | 25.7 | 15.1 | 14.3 |
| Content Type | 16.6 | 15.7 | 9.6 | 10.4 |
| Price | 28.4 | 25.6 | 18.8 | 16.3 |
| Simultaneous Streams | 14.9 | 16.1 | 8.5 | 9.0 |
| Video Quality | 12.4 | 16.9 | 8.6 | 11.0 |
rai_gender %>%
ggplot(aes(x = Attribute, y = Mean, fill = gender)) +
geom_bar(stat = "identity", position = position_dodge(0.8),
alpha = 0.8, width = 0.7) +
geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD),
position = position_dodge(0.8), width = 0.3) +
scale_fill_brewer(palette = "Set2", name = "Gender") +
labs(title = "Attribute Importance by Gender",
x = "", y = "RAI (%)") +
theme_bw(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "bottom",
axis.text = element_text(color = "black"),
axis.text.x = element_text(angle = 20, hjust = 1)
)
rai_ott <- rai_with_demo %>%
left_join(survey_matched %>% select(record_id, ott_use), by = "record_id") %>%
filter(!is.na(ott_use)) %>%
mutate(ott_group = ifelse(ott_use == "No", "Non-user", "User")) %>%
pivot_longer(cols = -c(record_id, ott_use, ott_group),
names_to = "Attribute", values_to = "RAI") %>%
group_by(ott_group, Attribute) %>%
summarise(Mean = round(mean(RAI), 1), SD = round(sd(RAI), 1), .groups = "drop")
rai_ott %>%
pivot_wider(names_from = ott_group, values_from = c(Mean, SD)) %>%
kable(caption = "Attribute Importance by OTT Usage (HB-MNL)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Attribute | Mean_User | SD_User |
|---|---|---|
| Ads | 26.7 | 14.6 |
| Content Type | 16.1 | 10.0 |
| Price | 26.9 | 17.5 |
| Simultaneous Streams | 15.5 | 8.7 |
| Video Quality | 14.7 | 10.1 |
We simulate market shares for five product configurations that represent different positioning strategies in the OTT market.
# Product configurations (effects coded)
# Coding reminder:
# ads: ref=With Ads(1), Limited Ads(2), Ad-free(3)
# quality: ref=HD 720p(1), Full HD(2), 4K+HDR(3)
# streams: ref=1 screen(1), 2 screens(2), 4 screens(3)
# content: ref=Licensed(1), Orig&Lic(2), Exclusive(3)
# Effects coding: included level=1, ref level=-1 for both dummies
products <- data.frame(
Name = c(
"Budget Basic",
"Mid-Tier (Netflix Standard)",
"Premium All-In",
"Ad-Supported Value",
"Family Plan",
"None (Opt-out)"
),
Description = c(
"Cheapest, ads, basic quality, 1 screen, licensed",
"Mid-price, limited ads, Full HD, 2 screens, mixed content",
"Top-tier, ad-free, 4K, 4 screens, exclusive originals",
"Low price, with ads, Full HD, 2 screens, mixed content",
"Mid-high price, ad-free, Full HD, 4 screens, mixed content",
"Consumer opts out of all options"
),
# Effects-coded dummies
ads_2 = c(-1, 1, -1, -1, -1, 0),
ads_3 = c(-1, -1, 1, -1, 1, 0),
quality_2 = c(-1, 1, -1, 1, 1, 0),
quality_3 = c(-1, -1, 1, -1, -1, 0),
streams_2 = c(-1, 1, -1, 1, -1, 0),
streams_3 = c(-1, -1, 1, -1, 1, 0),
content_2 = c(-1, 1, -1, 1, 1, 0),
content_3 = c(-1, -1, 1, -1, -1, 0),
price_mc = c(4.99, 9.99, 14.99, 7.99, 12.99, 0) - mw_price,
NONE = c(0, 0, 0, 0, 0, 1),
stringsAsFactors = FALSE
)
products %>%
select(Name, Description) %>%
mutate(
Price = c("€4.99", "€9.99", "€14.99", "€7.99", "€12.99", "-"),
Ads = c("With Ads", "Limited Ads", "Ad-free", "With Ads", "Ad-free", "-"),
Quality = c("HD 720p", "Full HD", "4K HDR", "Full HD", "Full HD", "-"),
Streams = c("1", "2", "4", "2", "4", "-"),
Content = c("Licensed", "Orig & Lic", "Exclusive", "Orig & Lic", "Orig & Lic", "-")
) %>%
kable(caption = "Market Simulation: Product Configurations") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Name | Description | Price | Ads | Quality | Streams | Content |
|---|---|---|---|---|---|---|
| Budget Basic | Cheapest, ads, basic quality, 1 screen, licensed | €4.99 | With Ads | HD 720p | 1 | Licensed |
| Mid-Tier (Netflix Standard) | Mid-price, limited ads, Full HD, 2 screens, mixed content | €9.99 | Limited Ads | Full HD | 2 | Orig & Lic |
| Premium All-In | Top-tier, ad-free, 4K, 4 screens, exclusive originals | €14.99 | Ad-free | 4K HDR | 4 | Exclusive |
| Ad-Supported Value | Low price, with ads, Full HD, 2 screens, mixed content | €7.99 | With Ads | Full HD | 2 | Orig & Lic |
| Family Plan | Mid-high price, ad-free, Full HD, 4 screens, mixed content | €12.99 | Ad-free | Full HD | 4 | Orig & Lic |
| None (Opt-out) | Consumer opts out of all options |
|
|
|
|
|
The Family Plan captures the largest market share. What happens to the remaining products if a competitor discontinues this offering — or if it is not available in a given market?
# Remove "Family Plan" (product 5)
products_reduced <- products[-5, ]
products_matrix_red <- as.matrix(products_reduced[,
c("ads_2", "ads_3", "quality_2", "quality_3",
"streams_2", "streams_3", "content_2", "content_3", "price_mc", "NONE")])
ind_utils_red <- betas_ind %*% t(products_matrix_red)
ind_probs_red <- t(apply(ind_utils_red, 1, mnl_prob))
shares_red <- round(colMeans(ind_probs_red) * 100, 1)
data.frame(
Product = products_reduced$Name,
`With Family Plan` = round(shares_hb[-5], 1),
`Without Family Plan` = shares_red,
Change = round(shares_red - shares_hb[-5], 1),
check.names = FALSE
) %>%
kable(caption = "Scenario: Market Shares Without Family Plan (HB-MNL)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Product | With Family Plan | Without Family Plan | Change | |
|---|---|---|---|---|
| Budget Basic | Budget Basic | 1.2 | 1.5 | 0.3 |
| Mid-Tier (Netflix Standard) | Mid-Tier (Netflix Standard) | 9.9 | 14.5 | 4.6 |
| Premium All-In | Premium All-In | 13.2 | 23.9 | 10.7 |
| Ad-Supported Value | Ad-Supported Value | 3.8 | 6.4 | 2.6 |
| None (Opt-out) | None (Opt-out) | 44.6 | 53.6 | 9.0 |
data.frame(
Finding = c(
"Price dominates choice",
"Ads are the second-most important attribute",
"Content mix beats exclusivity",
"Quality and streams are secondary",
"Moderate opt-out rate",
"Substantial preference heterogeneity"
),
Detail = c(
"Price is the most important attribute, consistent with the price-sensitive profile of our sample.",
"Ad-free is strongly preferred; WTP for removing ads is among the highest across all attributes.",
"'Originals & Licensed' is preferred over 'Licensed Only' and 'Exclusive Originals'. A mixed catalog resonates better than exclusivity alone.",
"Video Quality and Simultaneous Streams each account for roughly 13-16% of importance.",
paste0(none_chosen_pct, "% of choices were 'None', suggesting most respondents would subscribe if the offering is reasonable."),
"Large SDs in individual RAI scores show that preferences vary considerably across respondents."
)
) %>%
kable(caption = "Summary of Key Findings") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Finding | Detail |
|---|---|
| Price dominates choice | Price is the most important attribute, consistent with the price-sensitive profile of our sample. |
| Ads are the second-most important attribute | Ad-free is strongly preferred; WTP for removing ads is among the highest across all attributes. |
| Content mix beats exclusivity | ‘Originals & Licensed’ is preferred over ‘Licensed Only’ and ‘Exclusive Originals’. A mixed catalog resonates better than exclusivity alone. |
| Quality and streams are secondary | Video Quality and Simultaneous Streams each account for roughly 13-16% of importance. |
| Moderate opt-out rate | 18.4% of choices were ‘None’, suggesting most respondents would subscribe if the offering is reasonable. |
| Substantial preference heterogeneity | Large SDs in individual RAI scores show that preferences vary considerably across respondents. |
Tiered pricing matters. Price sensitivity dominates choice. Platforms should maintain a clear tier ladder — a budget entry point (~€5), a mid-tier (~€8-10), and a feature-rich option at higher price.
Ad-free commands a premium. Consumers value ad removal almost as much as the price itself. Higher-priced ad-free tiers are justified by the data.
Content breadth over exclusivity. “Originals & Licensed” is consistently the top content preference. Over-investing in exclusive originals at the expense of a broad catalog does not pay off.
The “Family Plan” works. An ad-free, multi-screen, mixed-content offering at €12.99 dominates the simulation with the highest share, driven by shared household viewing.
Budget tiers need substance. A low price with ads, basic quality, and limited content loses to mid-tier options and even to opt-out. The cheapest tier must still deliver on content.
Preferences vary across respondents. The large SDs in individual-level RAI suggest different consumer segments respond to different positioning — some are price-driven, others prioritize ad-free or content quality.